
# Importing packages for Data Analysis
import numpy as np # Linear Algebra
import pandas as pd # Data Processing
# Importing packages for Data Visuals
import matplotlib.pyplot as plt
import seaborn as sns
import branca.colormap
# NOTE: IPython magic below — this file is a notebook export and only runs
# inside Jupyter/IPython as-is.
%matplotlib inline
# Importing packages for Interactive Data Visuals
import folium
from folium.plugins import HeatMap
import folium.plugins as plugins
from folium import FeatureGroup, LayerControl, Map
from branca.colormap import LinearColormap
from branca.colormap import linear
from plotly import __version__
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
# Enable offline plotly rendering and route pandas .iplot() through cufflinks
init_notebook_mode(connected=True)
cf.go_offline()
from IPython.display import HTML
import plotly.express as px
# Importing Wordcloud for Visual
from wordcloud import WordCloud, STOPWORDS
import spacy
# Small English spaCy pipeline; requires `python -m spacy download en_core_web_sm`
nlp = spacy.load('en_core_web_sm')
C:\Users\ADMIN\anaconda3\Lib\site-packages\paramiko\transport.py:219: CryptographyDeprecationWarning: Blowfish has been deprecated
# Importing natural language packages/library to process text
import re
import nltk
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Downloading data for NLTK.
# quiet=True on every download keeps notebook output clean; the original
# vader_lexicon download was the only one missing it.
nltk.download('vader_lexicon', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('opinion_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
[nltk_data] Downloading package vader_lexicon to [nltk_data] C:\Users\ADMIN\AppData\Roaming\nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
True
# Load the tweet dataset.
# low_memory=False reads the file in one pass so pandas infers one dtype per
# column — the original call emitted a DtypeWarning about mixed types in
# columns 5, 6, 7 and 12.
df = pd.read_csv("D:/SIBMB/SEM 3/TM/Grp Assignment/covidvaccine.csv", low_memory=False)
df.head()
C:\Users\ADMIN\AppData\Local\Temp\ipykernel_16792\4079493533.py:1: DtypeWarning: Columns (5,6,7,12) have mixed types. Specify dtype option on import or set low_memory=False.
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MyNewsNE | Assam | MyNewsNE a dedicated multi-lingual media house... | 24-05-2020 10:18 | 64.0 | 11.0 | 110.0 | False | 18-08-2020 12:55 | Australia to Manufacture Covid-19 Vaccine and ... | ['CovidVaccine'] | Twitter Web App | False |
| 1 | Shubham Gupta | NaN | I will tell about all experiences of my life f... | 14-08-2020 16:42 | 1.0 | 17.0 | 0.0 | False | 18-08-2020 12:55 | #CoronavirusVaccine #CoronaVaccine #CovidVacci... | ['CoronavirusVaccine', 'CoronaVaccine', 'Covid... | Twitter for Android | False |
| 2 | Journal of Infectiology | NaN | Journal of Infectiology (ISSN 2689-9981) is ac... | 14-12-2017 07:07 | 143.0 | 566.0 | 8.0 | False | 18-08-2020 12:46 | Deaths due to COVID-19 in Affected Countries\n... | NaN | Twitter Web App | False |
| 3 | Zane | NaN | Fresher than you. | 18-09-2019 11:01 | 29.0 | 25.0 | 620.0 | False | 18-08-2020 12:45 | @Team_Subhashree @subhashreesotwe @iamrajchoco... | NaN | Twitter for Android | False |
| 4 | Ann-Maree O’Connor | Adelaide, South Australia | Retired university administrator. Melburnian b... | 24-01-2013 14:53 | 83.0 | 497.0 | 10737.0 | False | 18-08-2020 12:45 | @michellegrattan @ConversationEDU This is what... | NaN | Twitter Web App | False |
# Dataset dimensions as (rows, columns)
df.shape
(399645, 13)
# Per-column dtype and non-null counts (most columns load as object)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 399645 entries, 0 to 399644 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_name 399626 non-null object 1 user_location 315133 non-null object 2 user_description 377791 non-null object 3 user_created 399619 non-null object 4 user_followers 399597 non-null float64 5 user_friends 399597 non-null object 6 user_favourites 399597 non-null object 7 user_verified 399597 non-null object 8 date 399595 non-null object 9 text 399597 non-null object 10 hashtags 337962 non-null object 11 source 397185 non-null object 12 is_retweet 399576 non-null object dtypes: float64(1), object(12) memory usage: 39.6+ MB
# Percentage of missing values per column.
# Renamed Missing_values -> missing_pct (PEP 8 snake_case; the old name is
# not referenced anywhere else).
missing_pct = df.isnull().sum() / len(df) * 100
missing_pct.iplot(kind='bar',title='Missing values in each columns in %',theme='white',color='#3DD8AD')
# Dropping rows with null values in the "text" and "user_name" columns —
# one dropna pass instead of two chained calls.
df.dropna(subset=['text', 'user_name'], inplace=True)
# Resetting the index after dropping rows
df.reset_index(drop=True, inplace=True)
# Remaining nulls per column
df.isnull().sum()
user_name 0 user_location 84476 user_description 21833 user_created 0 user_followers 0 user_friends 0 user_favourites 0 user_verified 0 date 2 text 0 hashtags 61621 source 2412 is_retweet 21 dtype: int64
# Taking care of nulls in location, description, source and hashtags.
# Column assignment instead of attribute-access fillna(inplace=True): the
# latter is chained assignment, which pandas >= 2.1 warns about and no
# longer guarantees to modify df.
df['user_location'] = df['user_location'].fillna('Unknown')
df['user_description'] = df['user_description'].fillna('Unknown')
df['source'] = df['source'].fillna('Unknown')
df['hashtags'] = df['hashtags'].fillna('None')
# Verifying
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 399578 entries, 0 to 399577 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_name 399578 non-null object 1 user_location 399578 non-null object 2 user_description 399578 non-null object 3 user_created 399578 non-null object 4 user_followers 399578 non-null float64 5 user_friends 399578 non-null object 6 user_favourites 399578 non-null object 7 user_verified 399578 non-null object 8 date 399576 non-null object 9 text 399578 non-null object 10 hashtags 399578 non-null object 11 source 399578 non-null object 12 is_retweet 399557 non-null object dtypes: float64(1), object(12) memory usage: 39.6+ MB
# Cleaning the location column: free text frequently holds "city, country",
# so take the part after the last comma as the country and before the first
# comma as the city.
df["country"] = df.user_location.apply(lambda x: x.split(",")[-1].strip() \
                                       if ("," in x) else x)
df["city"] = df.user_location.apply(lambda x: x.split(",")[0].strip() \
                                    if ("," in x) else x)
# Replacing 2-letter US state codes with 'USA', except 'UK'.
# BUG FIX: the original compared the raw value (`x != 'uk'`), so an
# upper-case 'UK' slipped through the check and was mapped to USA.
df["country"] = df.country.apply(lambda x: 'USA' if len(x.lower().strip()) < 3
                                 and x.lower().strip() != 'uk' else x)
# Standardising case-sensitive country spellings.
# BUG FIX: membership must be tested against a tuple of names; the original
# used `in "united states, usa, us"`, which is a *substring* test — values
# like 'united' or 'tat' were silently mapped to USA.
df["country"] = df.country.apply(lambda x: 'USA' if x.lower().strip() in
                                 ('united states', 'usa', 'us') else x)
df["country"] = df.country.apply(lambda x: 'India' if x.lower().strip() == 'india' else x)
# Removing countries from city column
df["city"] = df.city.apply(lambda x: 'Unknown' if x.lower() in
                           ['india', 'united states', 'united kingdom', 'uk', 'usa', 'us']
                           else x)
# Taking top 100 countries and cities excluding the unknown at index 0
top100_count = df.country.value_counts(sort=True, ascending=False)[1:101]
top100_count = pd.DataFrame(top100_count)
top100_city = df.city.value_counts(sort=True, ascending=False)[1:101]
top100_city = pd.DataFrame(top100_city)
def get_coordinates(input_type, name, output_as='center'):
    """
    Look up the coordinates of a country/city via the Nominatim API.

    Attributes
    ----------
    input_type : str
        Pass 'country' or 'city' — used as the Nominatim query parameter
    name : str
        Name of the country or city we need the coordinates for
    output_as : str
        Pass 'center' or 'boundingbox' depending upon what coordinates type to fetch

    Methods
    -------
    Returns the coordinates as a list of floats, or [0, 0] on any failure
    """
    # `params=` lets requests URL-encode the place name — the original string
    # concatenation built invalid URLs for multi-word names ("New York").
    # `requests` is imported in the next cell, before the first call here.
    url = 'http://nominatim.openstreetmap.org/search'
    params = {input_type: name, 'format': 'json', 'polygon': 0}
    try:
        # timeout prevents a stalled geocoder request from hanging the run
        response = requests.get(url, params=params, timeout=10)
        data = response.json()[0]
        if output_as == 'center':
            return [float(data[key]) for key in ('lat', 'lon')]
        if output_as == 'boundingbox':
            return [float(v) for v in data['boundingbox']]
        # Unknown output_as previously raised NameError; use the same
        # sentinel as a failed lookup instead.
        return [0, 0]
    except (IndexError, ValueError, KeyError, requests.exceptions.RequestException):
        # empty result set / malformed JSON / missing key / network error
        return [0, 0]
import requests
# Resolve coordinates for every top-100 country and city name
# (one Nominatim HTTP request per place).
lat_long_count = [get_coordinates("country", place) for place in top100_count.index]
lat_long_city = [get_coordinates("city", place) for place in top100_city.index]
# Calling the get_coodinates() and drop the duplicates from dataframe
def top100_df(input_type,df,lat_long):
    """
    Attach coordinates to a top-100 frame and normalise its columns.

    Attributes
    ----------
    input_type : str
        Pass 'country' or 'city' — selects which rename mapping to apply
    df : pandas.DataFrame
        value_counts()-derived frame indexed by place name
    lat_long : list
        [lat, lon] pairs in the same order as df.index

    Methods
    -------
    Returns df (mutated in place) with 'lat'/'long' columns, the place name
    moved into a regular column, and duplicated coordinates dropped

    Notes
    -----
    NOTE(review): the rename keys ('index' plus 'country'/'city') depend on
    the column names value_counts()/reset_index() produce, which changed in
    pandas 2.0 — confirm against the installed pandas version.
    """
    # merge lat and long with original dataframe
    for i, pair in zip(df.index, lat_long):
        df.loc[i,'lat'] = pair[0]
        df.loc[i,'long'] = pair[1]
    # reset the index as country names
    df.reset_index(level=0, inplace=True)
    if input_type=="country":
        df.rename(columns={"country":"# of tweets","index":"country"}, inplace=True)
    elif input_type=="city":
        df.rename(columns={"city":"# of tweets","index":"city"}, inplace=True)
    # drop the countries/cities with unidentified (get_coordinates' [0, 0]
    # fallback) or duplicated latitudes and longitudes
    df.drop_duplicates(subset=['lat','long'],inplace=True)
    return df
# Finalise the country and city frames: attach coordinates, drop duplicates
top100_count = top100_df("country", top100_count, lat_long_count)
top100_city = top100_df("city", top100_city, lat_long_city)
# Drop the placeholder 'Unknown' rows before plotting
top100_count_filtered = top100_count.loc[top100_count['country'].ne('Unknown')]
top100_city_filtered = top100_city.loc[top100_city['city'].ne('Unknown')]
# Bar charts of tweet volume by country and by city (filtered frames)
fig1 = px.bar(top100_count_filtered, x='country', y='# of tweets', color='country',
              title='Tweets by Country', labels={'# of tweets': 'Number of Tweets'})
fig2 = px.bar(top100_city_filtered, x='city', y='# of tweets', color='city',
              title='Tweets by City', labels={'# of tweets': 'Number of Tweets'})
# Rotate the tick labels, then render each chart in turn
for fig in (fig1, fig2):
    fig.update_xaxes(tickangle=90)
    fig.show()
import folium
def color(magnitude):
    """
    Map a tweet count to a marker colour.

    >= 2000 -> 'red', 500-1999 -> 'beige', otherwise 'green'.

    BUG FIX: the original's three-way elif chain was not exhaustive — a NaN
    magnitude satisfied none of the branches and raised UnboundLocalError.
    Early returns make 'green' the guaranteed fallback.
    """
    if magnitude >= 2000:
        return 'red'
    if magnitude >= 500:
        return 'beige'
    return 'green'
def generateBaseMap(input_type, df, default_location=[40.693943, -73.985880], default_zoom_start=2):
    """
    Build a folium map with one Twitter-icon marker per place, a heat layer,
    and a colour bar scaled to the tweet counts.

    Attributes
    ----------
    input_type : str
        Pass 'country' or 'city' — used only in the colour-bar caption
    df : pandas.DataFrame
        Frame holding 'lat', 'long', '# of tweets' and the place name in
        its first column
    default_location : list of float
        [lat, lon] the map is initially centred on (the original docstring
        said `int`, which was wrong)
    default_zoom_start : int
        Pass the default zoom for the displayed heatmap

    Methods
    -------
    Returns the base_map

    Notes
    -----
    - The mutable list default for ``default_location`` is shared across
      calls; harmless here since it is never mutated.
    - NOTE(review): ``folium.LinearColormap`` — confirm the installed folium
      exposes this attribute; branca's ``LinearColormap`` is already
      imported at the top of this file as a fallback.
    """
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    for lat, lan, tweet, name in zip(df['lat'], df['long'], df['# of tweets'], df.iloc[:, 0]):
        # Create a custom icon using a custom image — a fresh instance per
        # marker (presumably required per marker; confirm whether folium
        # allows sharing one CustomIcon across markers)
        twitter_icon = folium.CustomIcon(
            icon_image="twitter-icon.png", # Path to your custom image
            icon_size=(30, 30)
        )
        folium.Marker(location=[lat, lan], popup=[name, tweet], icon=twitter_icon).add_to(base_map)
    # Heatmap layer should be added after adding markers
    HeatMap(data=df[['lat', 'long']].values.tolist(), radius=20, max_zoom=13).add_to(base_map)
    # Colour scale spanning the observed tweet counts
    min_val, max_val = df['# of tweets'].min(), df['# of tweets'].max()
    colormap = folium.LinearColormap(['green', 'beige', 'red'], vmin=min_val, vmax=max_val)
    colormap.caption = input_type.title() + ' distribution of COVID-19 tweets'
    colormap.add_to(base_map)
    return base_map
# Calling the function to generate the map.
# NOTE(review): in a notebook only the last expression of a cell renders,
# so the country map returned by the first call is discarded here — capture
# or display() it if both maps are wanted.
generateBaseMap('country', top100_count)
generateBaseMap('city',top100_city)
# Top 15 user names by number of tweets
import plotly.io as pio
pio.templates.default = "simple_white"
# user_location has no nulls at this point, so count() equals tweets per user
tweets_per_user = df.groupby('user_name')['user_location'].count().reset_index()
tweets_per_user = tweets_per_user.sort_values('user_location').tail(15)
fig = px.bar(tweets_per_user, x='user_name', y='user_location', color='user_location',
             labels={'user_name':'Users','user_location':'Number of tweets'},
             title='Top 15 user by number of tweets', height=600)
fig.show()
import plotly.express as px
# Ten most common tweet sources with their counts
top10_source = df.source.value_counts().nlargest(10).reset_index()
top10_source.columns = ['Source', '# of Tweets']
# Interactive bar chart of tweets per source
fig = px.bar(
    top10_source,
    x='Source',
    y='# of Tweets',
    title='Tweets by Source',
    labels={'# of Tweets': 'Number of Tweets'},
    text='# of Tweets',
)
# Styling: light fill, dark outline, counts printed above the bars
fig.update_traces(
    marker_color='lightblue',
    marker_line_color='darkblue',
    marker_line_width=1.5,
    textposition='outside',
)
fig.update_xaxes(title_text='Source')
fig.update_yaxes(title_text='Number of Tweets')
# Centre the title and tilt the x labels for readability
fig.update_layout(
    title={'text': 'Tweets by Source', 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
    xaxis_title='Source',
    yaxis_title='Number of Tweets',
    xaxis_tickangle=-45,
)
fig.show()
# Plotting hashtags as a word cloud.
# FIXES: (1) build the corpus with a list + join instead of quadratic string
# concatenation over ~400k rows; (2) the original bound the name `stopwords`,
# shadowing the nltk.corpus.stopwords module imported at the top of the file.
hashtag_tokens = []
for val in df['hashtags']:
    # typecast each value to string (NaN was filled with 'None' earlier,
    # but str() keeps this robust) and lowercase its tokens
    hashtag_tokens.extend(str(val).lower().split())
comment_words = " ".join(hashtag_tokens)
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(width = 1800, height = 1200,
                      background_color ='white',
                      stopwords = wc_stopwords,
                      min_font_size = 10).generate(comment_words)
# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = '#40e0d0')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.title('hashtags')
plt.show()
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
# Function to perform text preprocessing.
# Module-level resources built once — the original rebuilt the stopword set
# and recompiled the emoji regex on every call, once per tweet (~400k rows).
_STOPWORDS = set(stopwords.words('english'))
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002500-\U00002BEF"  # chinese char
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # dingbats
                            u"\u3030"
                            "]+", flags=re.UNICODE)

def preprocess_text(text):
    """
    Normalise raw tweet text for the NLP pipeline.

    Steps: lowercase -> strip URLs -> strip punctuation -> drop stray single
    letters -> collapse whitespace -> remove English stopwords -> strip emoji.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned, space-separated tokens.
    """
    # str() guards against non-string rows before lowercasing
    text = str(text).lower()
    # URL removal from text
    text = re.sub(r"http\S+", "", text)
    # Punctuation removal from text (single C-level pass)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Drop lone single letters. BUG FIX: substituting a space keeps the
    # neighbouring words apart — the original substituted '' and consumed
    # the surrounding whitespace, gluing adjacent words together. \b also
    # catches single letters at the start/end of the text.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)
    # Collapse runs of whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Stopword removal from text
    text = " ".join(word for word in text.split() if word not in _STOPWORDS)
    # Emoji removal
    return _EMOJI_PATTERN.sub('', text)
# Apply the preprocessing function to the 'text' column of the dataframe.
# NOTE(review): this overwrites the raw tweets in place — the original text
# is unrecoverable afterwards, and the 'cleaned_text' column created below
# is produced by cleaning this already-cleaned text a second time.
df['text'] = df['text'].apply(preprocess_text)
# Display the resulting DataFrame
df['text']
0 australia manufacture covid19 vaccine give cit...
1 coronavirusvaccine coronavaccine covidvaccine ...
2 deaths due covid19 affected countries read rpi...
3 teamsubhashree subhashreesotwe iamrajchoco sta...
4 michellegrattan conversationedu passes leaders...
...
399573 billgates billgatesbioterrorist billgatesbillg...
399574 incredible pfizer boss backtracking vaccines s...
399575 sask social services admits vaxxing kids sans ...
399576 pharmaceutical industry would never lie us pro...
399577 running money covidvaccine covid19 covid covi...
Name: text, Length: 399578, dtype: object
# Apply the preprocessing function to a new column 'cleaned_text'.
# NOTE(review): 'text' was already overwritten with its cleaned form above,
# so this is a second pass over already-clean text (mostly a no-op) —
# confirm whether the intent was to clean the *raw* text here.
df['cleaned_text'] = df['text'].apply(preprocess_text)
# Tokenization function
def tokenization(text):
    """
    Split text on runs of non-word characters.

    A raw string for the pattern avoids the invalid-escape-sequence warning
    that '\\W+' in a plain string triggers on modern Python. Behaviour is
    unchanged, including the empty token re.split yields when the text
    starts or ends with a non-word character.
    """
    return re.split(r'\W+', text)
# Remove stopwords function
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return the token list with English stopwords filtered out."""
    return [token for token in text if token not in STOPWORDS]
# Stemming helpers: Porter and Snowball variants over a token list
ps = PorterStemmer()
def stemming_porter(text):
    """Porter-stem every token in the list."""
    return [ps.stem(token) for token in text]
ss = SnowballStemmer(language='english')
def stemming_snowball(text):
    """Snowball-stem every token in the list."""
    return [ss.stem(token) for token in text]
# Lemmatization function (WordNet)
wn = WordNetLemmatizer()
def lemmatization(text):
    """Lemmatize every token in the list with WordNet's lemmatizer."""
    return [wn.lemmatize(token) for token in text]
# Apply tokenization, stopword removal, stemming, and lemmatization to
# 'cleaned_text'; each stage writes its own column, several stages feeding
# off the 'No_stopwords' tokens.
text_pipeline = [
    ('tokenized', 'cleaned_text', tokenization),
    ('No_stopwords', 'tokenized', remove_stopwords),
    ('stemmed_porter', 'No_stopwords', stemming_porter),
    ('stemmed_snowball', 'No_stopwords', stemming_snowball),
    ('lemmatized', 'No_stopwords', lemmatization),
]
for target, source, transform in text_pipeline:
    df[target] = df[source].apply(transform)
# Displaying the required columns
selected_columns = df[['text', 'cleaned_text', 'tokenized', 'No_stopwords', 'stemmed_porter', 'stemmed_snowball', 'lemmatized']]
selected_columns.head(10)
| text | cleaned_text | tokenized | No_stopwords | stemmed_porter | stemmed_snowball | lemmatized | |
|---|---|---|---|---|---|---|---|
| 0 | australia manufacture covid19 vaccine give cit... | australia manufacture covid19 vaccine give cit... | [australia, manufacture, covid19, vaccine, giv... | [australia, manufacture, covid19, vaccine, giv... | [australia, manufactur, covid19, vaccin, give,... | [australia, manufactur, covid19, vaccin, give,... | [australia, manufacture, covid19, vaccine, giv... |
| 1 | coronavirusvaccine coronavaccine covidvaccine ... | coronavirusvaccine coronavaccine covidvaccine ... | [coronavirusvaccine, coronavaccine, covidvacci... | [coronavirusvaccine, coronavaccine, covidvacci... | [coronavirusvaccin, coronavaccin, covidvaccin,... | [coronavirusvaccin, coronavaccin, covidvaccin,... | [coronavirusvaccine, coronavaccine, covidvacci... |
| 2 | deaths due covid19 affected countries read rpi... | deaths due covid19 affected countries read rpi... | [deaths, due, covid19, affected, countries, re... | [deaths, due, covid19, affected, countries, re... | [death, due, covid19, affect, countri, read, r... | [death, due, covid19, affect, countri, read, r... | [death, due, covid19, affected, country, read,... |
| 3 | teamsubhashree subhashreesotwe iamrajchoco sta... | teamsubhashree subhashreesotwe iamrajchoco sta... | [teamsubhashree, subhashreesotwe, iamrajchoco,... | [teamsubhashree, subhashreesotwe, iamrajchoco,... | [teamsubhashre, subhashreesotw, iamrajchoco, s... | [teamsubhashre, subhashreesotw, iamrajchoco, s... | [teamsubhashree, subhashreesotwe, iamrajchoco,... |
| 4 | michellegrattan conversationedu passes leaders... | michellegrattan conversationedu passes leaders... | [michellegrattan, conversationedu, passes, lea... | [michellegrattan, conversationedu, passes, lea... | [michellegrattan, conversationedu, pass, leade... | [michellegrattan, conversationedu, pass, leade... | [michellegrattan, conversationedu, pass, leade... |
| 5 | multisystem inflammatory syndromechildren misc... | multisystem inflammatory syndromechildren misc... | [multisystem, inflammatory, syndromechildren, ... | [multisystem, inflammatory, syndromechildren, ... | [multisystem, inflammatori, syndromechildren, ... | [multisystem, inflammatori, syndromechildren, ... | [multisystem, inflammatory, syndromechildren, ... |
| 6 | privilrodrigues yatish57 deepkaranahuja shrist... | privilrodrigues yatish57 deepkaranahuja shrist... | [privilrodrigues, yatish57, deepkaranahuja, sh... | [privilrodrigues, yatish57, deepkaranahuja, sh... | [privilrodrigu, yatish57, deepkaranahuja, shri... | [privilrodrigu, yatish57, deepkaranahuja, shri... | [privilrodrigues, yatish57, deepkaranahuja, sh... |
| 7 | msnbc well let’s qualify would anyone party ge... | msnbc well let’s qualify would anyone party ge... | [msnbc, well, let, s, qualify, would, anyone, ... | [msnbc, well, let, qualify, would, anyone, par... | [msnbc, well, let, qualifi, would, anyon, part... | [msnbc, well, let, qualifi, would, anyon, part... | [msnbc, well, let, qualify, would, anyone, par... |
| 8 | countries without ability make vaccines locall... | countries without ability make vaccines locall... | [countries, without, ability, make, vaccines, ... | [countries, without, ability, make, vaccines, ... | [countri, without, abil, make, vaccin, local, ... | [countri, without, abil, make, vaccin, local, ... | [country, without, ability, make, vaccine, loc... |
| 9 | dna zooms charts 1st week hear vaccines episod... | dna zooms charts 1st week hear vaccines episod... | [dna, zooms, charts, 1st, week, hear, vaccines... | [dna, zooms, charts, 1st, week, hear, vaccines... | [dna, zoom, chart, 1st, week, hear, vaccin, ep... | [dna, zoom, chart, 1st, week, hear, vaccin, ep... | [dna, zoom, chart, 1st, week, hear, vaccine, e... |
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()
# Ensure every entry in 'text' is a string before scoring
df['text'] = df['text'].astype(str)
def get_sentiment(text):
    """
    Label text as 'Positive'/'Negative'/'Neutral' from VADER's compound
    score, using the conventional +/-0.05 thresholds.
    """
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'
# Label every tweet
df['Sentiments'] = df['text'].apply(get_sentiment)
# Preview the engineered text columns together with the VADER sentiment label
selected_columns = df[['text', 'cleaned_text', 'tokenized', 'No_stopwords', 'stemmed_porter', 'stemmed_snowball', 'lemmatized', 'Sentiments']]
selected_columns.head(10)
| text | cleaned_text | tokenized | No_stopwords | stemmed_porter | stemmed_snowball | lemmatized | Sentiments | |
|---|---|---|---|---|---|---|---|---|
| 0 | australia manufacture covid19 vaccine give cit... | australia manufacture covid19 vaccine give cit... | [australia, manufacture, covid19, vaccine, giv... | [australia, manufacture, covid19, vaccine, giv... | [australia, manufactur, covid19, vaccin, give,... | [australia, manufactur, covid19, vaccin, give,... | [australia, manufacture, covid19, vaccine, giv... | Positive |
| 1 | coronavirusvaccine coronavaccine covidvaccine ... | coronavirusvaccine coronavaccine covidvaccine ... | [coronavirusvaccine, coronavaccine, covidvacci... | [coronavirusvaccine, coronavaccine, covidvacci... | [coronavirusvaccin, coronavaccin, covidvaccin,... | [coronavirusvaccin, coronavaccin, covidvaccin,... | [coronavirusvaccine, coronavaccine, covidvacci... | Positive |
| 2 | deaths due covid19 affected countries read rpi... | deaths due covid19 affected countries read rpi... | [deaths, due, covid19, affected, countries, re... | [deaths, due, covid19, affected, countries, re... | [death, due, covid19, affect, countri, read, r... | [death, due, covid19, affect, countri, read, r... | [death, due, covid19, affected, country, read,... | Negative |
| 3 | teamsubhashree subhashreesotwe iamrajchoco sta... | teamsubhashree subhashreesotwe iamrajchoco sta... | [teamsubhashree, subhashreesotwe, iamrajchoco,... | [teamsubhashree, subhashreesotwe, iamrajchoco,... | [teamsubhashre, subhashreesotw, iamrajchoco, s... | [teamsubhashre, subhashreesotw, iamrajchoco, s... | [teamsubhashree, subhashreesotwe, iamrajchoco,... | Positive |
| 4 | michellegrattan conversationedu passes leaders... | michellegrattan conversationedu passes leaders... | [michellegrattan, conversationedu, passes, lea... | [michellegrattan, conversationedu, passes, lea... | [michellegrattan, conversationedu, pass, leade... | [michellegrattan, conversationedu, pass, leade... | [michellegrattan, conversationedu, pass, leade... | Neutral |
| 5 | multisystem inflammatory syndromechildren misc... | multisystem inflammatory syndromechildren misc... | [multisystem, inflammatory, syndromechildren, ... | [multisystem, inflammatory, syndromechildren, ... | [multisystem, inflammatori, syndromechildren, ... | [multisystem, inflammatori, syndromechildren, ... | [multisystem, inflammatory, syndromechildren, ... | Neutral |
| 6 | privilrodrigues yatish57 deepkaranahuja shrist... | privilrodrigues yatish57 deepkaranahuja shrist... | [privilrodrigues, yatish57, deepkaranahuja, sh... | [privilrodrigues, yatish57, deepkaranahuja, sh... | [privilrodrigu, yatish57, deepkaranahuja, shri... | [privilrodrigu, yatish57, deepkaranahuja, shri... | [privilrodrigues, yatish57, deepkaranahuja, sh... | Neutral |
| 7 | msnbc well let’s qualify would anyone party ge... | msnbc well let’s qualify would anyone party ge... | [msnbc, well, let, s, qualify, would, anyone, ... | [msnbc, well, let, qualify, would, anyone, par... | [msnbc, well, let, qualifi, would, anyon, part... | [msnbc, well, let, qualifi, would, anyon, part... | [msnbc, well, let, qualify, would, anyone, par... | Positive |
| 8 | countries without ability make vaccines locall... | countries without ability make vaccines locall... | [countries, without, ability, make, vaccines, ... | [countries, without, ability, make, vaccines, ... | [countri, without, abil, make, vaccin, local, ... | [countri, without, abil, make, vaccin, local, ... | [country, without, ability, make, vaccine, loc... | Negative |
| 9 | dna zooms charts 1st week hear vaccines episod... | dna zooms charts 1st week hear vaccines episod... | [dna, zooms, charts, 1st, week, hear, vaccines... | [dna, zooms, charts, 1st, week, hear, vaccines... | [dna, zoom, chart, 1st, week, hear, vaccin, ep... | [dna, zoom, chart, 1st, week, hear, vaccin, ep... | [dna, zoom, chart, 1st, week, hear, vaccine, e... | Neutral |
import plotly.express as px
# Tweet counts per sentiment label, as a two-column frame
sentiment_counts = (
    df["Sentiments"]
    .value_counts()
    .reset_index()
    .set_axis(["Sentiment", "Count"], axis=1)
)
# Interactive pie chart; pull nudges every slice slightly outward
fig = px.pie(sentiment_counts, values="Count", names="Sentiment", title="Sentiment Distribution")
fig.update_traces(textinfo='percent+label', pull=[0.1, 0.1, 0.1])
fig.show()
# Word cloud over every lemmatized token from positive-sentiment tweets.
# A nested generator flattens the token lists straight into one string.
positive_text = ' '.join(
    word
    for tokens in df[df['Sentiments'] == 'Positive']['lemmatized']
    for word in tokens
)
# Build the cloud
wordcloud = WordCloud(
    background_color='black',
    colormap='viridis',
    width=800,
    height=400
)
wordcloud.generate(positive_text)
# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud for Positive Tweets')
plt.axis('off')
plt.show()
# Function to generate and display a word cloud for one sentiment class
def generate_word_cloud(sentiment):
    """Render a word cloud of the lemmatized tokens of *sentiment* tweets."""
    corpus = ' '.join(
        word
        for tokens in df[df['Sentiments'] == sentiment]['lemmatized']
        for word in tokens
    )
    cloud = WordCloud(
        background_color='black',
        colormap='viridis',
        width=800,
        height=400
    ).generate(corpus)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {sentiment} Tweets')
    plt.axis('off')
    plt.show()
# Word clouds for the remaining sentiment classes
generate_word_cloud('Negative')
generate_word_cloud('Neutral')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
# VADER compound sentiment score for every tweet
sid = SentimentIntensityAnalyzer()
sentiment_scores = df['text'].apply(lambda x: sid.polarity_scores(x))
sent_scores_df = pd.DataFrame(list(sentiment_scores))
# Parse tweet timestamps; unparseable 'date' values become NaT
parsed_time = pd.to_datetime(df['date'], errors='coerce')
valid_time = parsed_time.notnull()
# BUG FIX: the original dropped the bad-date rows from the time axis but
# plotted it against the *unfiltered* score series — the 'date' column has
# null entries, so x and y had different lengths and ax.plot raised.
# Apply the same mask to both sides so they stay aligned.
sentiments_time_df = pd.DataFrame({'time': parsed_time[valid_time]})
sentiments_time_df.index = sentiments_time_df['time']
compound_scores = sent_scores_df.loc[valid_time.values, 'compound']
# Plot sentiment polarity over time
fig = plt.figure(figsize=(20, 5))
ax = fig.add_subplot(111)
ax.plot(sentiments_time_df.index, compound_scores, label='Tweet Sentiment', color='y')
ax.set(title='Tweet Sentiments over Time', xlabel='Date', ylabel='Sentiment polarity')
ax.legend(loc='best')
fig.tight_layout()
plt.show()
from afinn import Afinn
# AFINN lexicon scoring over the lemmatized tweets
afinn_scorer = Afinn()
# One space-joined string per tweet for the scorer
df['lemmatized_text'] = df['lemmatized'].apply(' '.join)
# Integer AFINN score per tweet
df['score'] = df['lemmatized_text'].map(afinn_scorer.score).astype(int)
# Keep only tweets that carry a non-zero sentiment score
df_score = df[df['score'] != 0]
# Frequency table of the observed scores, sorted by score
score_freq = df_score.score.value_counts().sort_index().to_frame('n')
score_freq
| n | |
|---|---|
| -46 | 1 |
| -32 | 1 |
| -29 | 1 |
| -25 | 1 |
| -24 | 2 |
| -23 | 1 |
| -22 | 6 |
| -21 | 13 |
| -20 | 8 |
| -19 | 11 |
| -18 | 16 |
| -17 | 19 |
| -16 | 66 |
| -15 | 50 |
| -14 | 85 |
| -13 | 105 |
| -12 | 232 |
| -11 | 282 |
| -10 | 539 |
| -9 | 865 |
| -8 | 1443 |
| -7 | 2000 |
| -6 | 3569 |
| -5 | 5605 |
| -4 | 9167 |
| -3 | 14495 |
| -2 | 28871 |
| -1 | 21281 |
| 1 | 52299 |
| 2 | 43806 |
| 3 | 25394 |
| 4 | 14745 |
| 5 | 8315 |
| 6 | 5359 |
| 7 | 2929 |
| 8 | 1760 |
| 9 | 968 |
| 10 | 587 |
| 11 | 323 |
| 12 | 203 |
| 13 | 98 |
| 14 | 63 |
| 15 | 43 |
| 16 | 14 |
| 17 | 14 |
| 18 | 11 |
| 19 | 4 |
| 20 | 3 |
| 21 | 2 |
| 22 | 3 |
| 24 | 1 |
| 30 | 2 |
# Distribution of AFINN sentiment scores across tweets.
score_freq.plot.bar(
    legend=False,
    figsize=(8, 4),
    grid=True,
    color='gray')
plt.xlabel('Sentiment Score')
# Each bar counts *tweets* at that score (score_freq comes from
# df_score.score.value_counts()), so the original 'Frequency of Words'
# label was wrong.
plt.ylabel('Frequency of Tweets')
# The original title ('The Iliad: ...') was copy-pasted from an unrelated
# analysis; this notebook analyses COVID-vaccine tweets.
plt.title('COVID Vaccine Tweets: Sentiment Score Distribution', loc='left')
plt.xticks(rotation=0);
# Sequential 1-based tweet number, used below to bucket the tweets into
# sections of 100 for the rolling sentiment plot
df['line'] = range(1, len(df) + 1)
df.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | ... | cleaned_text | tokenized | No_stopwords | stemmed_porter | stemmed_snowball | lemmatized | Sentiments | lemmatized_text | score | line | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MyNewsNE | Assam | MyNewsNE a dedicated multi-lingual media house... | 24-05-2020 10:18 | 64.0 | 11.0 | 110.0 | False | 18-08-2020 12:55 | australia manufacture covid19 vaccine give cit... | ... | australia manufacture covid19 vaccine give cit... | [australia, manufacture, covid19, vaccine, giv... | [australia, manufacture, covid19, vaccine, giv... | [australia, manufactur, covid19, vaccin, give,... | [australia, manufactur, covid19, vaccin, give,... | [australia, manufacture, covid19, vaccine, giv... | Positive | australia manufacture covid19 vaccine give cit... | 1 | 1 |
| 1 | Shubham Gupta | Unknown | I will tell about all experiences of my life f... | 14-08-2020 16:42 | 1.0 | 17.0 | 0.0 | False | 18-08-2020 12:55 | coronavirusvaccine coronavaccine covidvaccine ... | ... | coronavirusvaccine coronavaccine covidvaccine ... | [coronavirusvaccine, coronavaccine, covidvacci... | [coronavirusvaccine, coronavaccine, covidvacci... | [coronavirusvaccin, coronavaccin, covidvaccin,... | [coronavirusvaccin, coronavaccin, covidvaccin,... | [coronavirusvaccine, coronavaccine, covidvacci... | Positive | coronavirusvaccine coronavaccine covidvaccine ... | 3 | 2 |
| 2 | Journal of Infectiology | Unknown | Journal of Infectiology (ISSN 2689-9981) is ac... | 14-12-2017 07:07 | 143.0 | 566.0 | 8.0 | False | 18-08-2020 12:46 | deaths due covid19 affected countries read rpi... | ... | deaths due covid19 affected countries read rpi... | [deaths, due, covid19, affected, countries, re... | [deaths, due, covid19, affected, countries, re... | [death, due, covid19, affect, countri, read, r... | [death, due, covid19, affect, countri, read, r... | [death, due, covid19, affected, country, read,... | Negative | death due covid19 affected country read rpirya... | -3 | 3 |
| 3 | Zane | Unknown | Fresher than you. | 18-09-2019 11:01 | 29.0 | 25.0 | 620.0 | False | 18-08-2020 12:45 | teamsubhashree subhashreesotwe iamrajchoco sta... | ... | teamsubhashree subhashreesotwe iamrajchoco sta... | [teamsubhashree, subhashreesotwe, iamrajchoco,... | [teamsubhashree, subhashreesotwe, iamrajchoco,... | [teamsubhashre, subhashreesotw, iamrajchoco, s... | [teamsubhashre, subhashreesotw, iamrajchoco, s... | [teamsubhashree, subhashreesotwe, iamrajchoco,... | Positive | teamsubhashree subhashreesotwe iamrajchoco sta... | 1 | 4 |
| 4 | Ann-Maree O’Connor | Adelaide, South Australia | Retired university administrator. Melburnian b... | 24-01-2013 14:53 | 83.0 | 497.0 | 10737.0 | False | 18-08-2020 12:45 | michellegrattan conversationedu passes leaders... | ... | michellegrattan conversationedu passes leaders... | [michellegrattan, conversationedu, passes, lea... | [michellegrattan, conversationedu, passes, lea... | [michellegrattan, conversationedu, pass, leade... | [michellegrattan, conversationedu, pass, leade... | [michellegrattan, conversationedu, pass, leade... | Neutral | michellegrattan conversationedu pass leadershi... | 1 | 5 |
5 rows × 25 columns
# Mean AFINN score per block of 100 tweets.
# NOTE(review): df_score was sliced from df *before* the 'line' column was
# added, so the grouper Series comes from df and is matched to df_score's
# rows by index alignment — it works, but breaks if either frame is
# re-indexed; consider adding 'line' before slicing.
score_acc = df_score.groupby(df['line'] // 100)\
.score.mean()\
.to_frame('score')\
.rename_axis('section')
score_acc.head(10)
| score | |
|---|---|
| section | |
| 0 | 0.615385 |
| 1 | 0.884615 |
| 2 | 0.428571 |
| 3 | 0.298246 |
| 4 | 0.807692 |
| 5 | 1.071429 |
| 6 | 0.578947 |
| 7 | 0.962963 |
| 8 | 1.446429 |
| 9 | 0.915254 |
# Per-section means (grey) with a 10-section rolling mean overlay (black)
ax = score_acc.plot.line(legend=False, figsize=(12, 6), grid=True, alpha=0.5, color='gray')
score_acc.rolling(10, min_periods=5).mean().plot.line(ax=ax, color='black')
# Sections are blocks of 100 *tweets*, not lines of a text
plt.xlabel('Section of 100 Tweets')
plt.ylabel('Mean Sentiment Score')
# The original title ('The Iliad: ...') was copy-pasted from an unrelated
# analysis; this notebook analyses COVID-vaccine tweets.
plt.title('COVID Vaccine Tweets: Mean Sentiment Score by Section', loc='left')
# Zero line separates net-positive from net-negative sections
plt.axhline(0, color='red')
plt.xticks(rotation=0);